Simpson's Paradox

Use admission_data.csv for this exercise.

In [84]:
# Load and view first few lines of dataset
import pandas as pd 
import numpy as np 

df = pd.read_csv("admission_data.csv") 
df.head()
Out[84]:
student_id gender major admitted
0 35377 female Chemistry False
1 56105 male Physics True
2 31441 female Chemistry False
3 51765 male Physics True
4 53714 female Physics True

Proportion and admission rate for each gender

In [93]:
df.groupby([ 'gender']).admitted.value_counts()
Out[93]:
gender  admitted
female  False       183
        True         74
male    False       125
        True        118
Name: admitted, dtype: int64
In [94]:
74/(74+183)
Out[94]:
0.28793774319066145
In [85]:
# Proportion of students that are female
rows = df.shape[0] 
(df.gender.value_counts()/rows)[1]
Out[85]:
0.48599999999999999
In [86]:
# Proportion of students that are male
(df.gender.value_counts()/rows)[0]
Out[86]:
0.51400000000000001
In [87]:
# Admission rate for females
len(df[(df.admitted==True)&(df.gender=='female')])/len(df[df.gender=='female'])
Out[87]:
0.28793774319066145
In [89]:
# Admission rate for males
len(df[(df.admitted==True)&(df.gender=='male')])/len(df[df.gender=='male'])
Out[89]:
0.48559670781893005
In [81]:
total_females = df.gender.value_counts().female
total_males = df.gender.value_counts().male
print(total_females) 
print(total_males) 
print(df.shape) 
257
243
(500, 4)

Proportion and admission rate for physics majors of each gender

In [26]:
# What proportion of female students are majoring in physics?
physics = df[df.major=='Physics']
print(physics.shape)
print(physics.gender.value_counts())
print()
print(physics.gender.value_counts().female / total_females)
(256, 4)
male      225
female     31
Name: gender, dtype: int64

0.120622568093
In [21]:
# What proportion of male students are majoring in physics?
physics.gender.value_counts().male / total_males
Out[21]:
0.92592592592592593
In [98]:
total_admitted_for_physics = physics[(physics.admitted==True)].admitted.sum()
females_admitted_for_physics = len(physics[(physics.admitted==True) & (physics.gender=='female')])
males_admitted_for_physics = len(physics[(physics.admitted==True) & (physics.gender=='male')])
In [100]:
# Admission rate for female physics majors
females_admitted_for_physics / total_admitted_for_physics 
Out[100]:
0.16546762589928057
In [101]:
# Admission rate for male physics majors
males_admitted_for_physics / total_admitted_for_physics
Out[101]:
0.83453237410071945
In [110]:
physics.groupby('gender').admitted.value_counts()
Out[110]:
gender  admitted
female  True         23
        False         8
male    True        116
        False       109
Name: admitted, dtype: int64
In [108]:
23/31
Out[108]:
0.7419354838709677

Proportion and admission rate for chemistry majors of each gender

In [44]:
chem = df[df.major=='Chemistry']
print(chem.shape)
print(chem.gender.value_counts())
print()
(244, 4)
female    226
male       18
Name: gender, dtype: int64

In [46]:
# What proportion of female students are majoring in chemistry?
chem.gender.value_counts().female / df.gender.value_counts().female
Out[46]:
0.87937743190661477
In [48]:
# What proportion of male students are majoring in chemistry?
chem.gender.value_counts().male / df.gender.value_counts().male
Out[48]:
0.07407407407407407
In [105]:
chem.groupby('gender').admitted.value_counts()
Out[105]:
gender  admitted
female  False       175
        True         51
male    False        16
        True          2
Name: admitted, dtype: int64
In [114]:
# Admission rate for female chemistry majors
51 /(51+175)
Out[114]:
0.22566371681415928
In [115]:
# Admission rate for male chemistry majors
2 /18
Out[115]:
0.1111111111111111

Admission rate for each major

In [113]:
df.groupby(['major', 'admitted']).gender.value_counts()
Out[113]:
major      admitted  gender
Chemistry  False     female    175
                     male       16
           True      female     51
                     male        2
Physics    False     male      109
                     female      8
           True      male      116
                     female     23
Name: gender, dtype: int64
In [74]:
# Admission rate for physics majors
len(df[(df.admitted==True) & (df.major=='Physics')]) / len(df[df.admitted==True])
Out[74]:
0.7239583333333334
In [76]:
# Admission rate for chemistry majors
len(df[(df.admitted==True) & (df.major=='Chemistry')]) / len(df[df.admitted==True])
Out[76]:
0.2760416666666667
In [ ]: